import pandas as pd
import numpy as np
from scipy.stats import spearmanr

# -------------------------------------------------
# 1. 读取两套结果
# -------------------------------------------------
file_robust = "Media_6Metrics_DeviationSummary_MAE&Hellinger.xlsx"
file_base   = "Media_6Metrics_DeviationSummary_new.xlsx"

df_r = pd.read_excel(file_robust, engine="openpyxl")
df_b = pd.read_excel(file_base, engine="openpyxl")

# -------------------------------------------------
# 2. 合并（以 Media 为键，确保顺序一致）
# -------------------------------------------------
df = pd.merge(
    df_b[['Media', 'ArithmeticMean']],
    df_r[['Media', 'Mean_MAE&Hellinger']],
    on='Media',
    how='inner'
)

df = df.rename(columns={
    'ArithmeticMean': 'Deviation_Base',
    'Mean_MAE&Hellinger': 'Deviation_Robust'
})

# -------------------------------------------------
# 3. Robustness Check 1：Spearman Rank Correlation
# -------------------------------------------------
rho, pval = spearmanr(df['Deviation_Base'], df['Deviation_Robust'])

print("=== Robustness Check 1: Rank-order stability ===")
print(f"Spearman rho = {rho:.3f}")
print(f"p-value      = {pval:.4g}")
print()

# -------------------------------------------------
# 4. Robustness Check 2：Rank Difference Analysis
# -------------------------------------------------
df['Rank_Base']   = df['Deviation_Base'].rank(ascending=False, method='min')
df['Rank_Robust'] = df['Deviation_Robust'].rank(ascending=False, method='min')

df['Rank_Diff'] = df['Rank_Robust'] - df['Rank_Base']
df['Abs_Rank_Diff'] = df['Rank_Diff'].abs()

print("=== Robustness Check 2: Rank changes ===")
print(f"Max |Rank Diff| : {df['Abs_Rank_Diff'].max():.0f}")
print(f"Mean |Rank Diff|: {df['Abs_Rank_Diff'].mean():.2f}")
print()

# -------------------------------------------------
# 4.1 标记 Extreme outlets（Top 10 / Bottom 10）
# -------------------------------------------------
n_media = len(df)

df['Is_Extreme'] = (
    (df['Rank_Base'] <= 10) |
    (df['Rank_Base'] > n_media - 10)
)

# Extreme vs Non-Extreme 的 rank diff 描述
extreme_stats = df.groupby('Is_Extreme')['Abs_Rank_Diff'].agg(
    ['count', 'mean', 'max']
)

print("=== Rank Difference by Outlet Type ===")
print(extreme_stats)
print()

# -------------------------------------------------
# 5. Robustness Check 3：Quartile Stability
# -------------------------------------------------
df['Q_Base']   = pd.qcut(df['Deviation_Base'], 4, labels=False)
df['Q_Robust'] = pd.qcut(df['Deviation_Robust'], 4, labels=False)

quartile_consistency = (df['Q_Base'] == df['Q_Robust']).mean()

print("=== Robustness Check 3: Quartile consistency ===")
print(f"Same quartile ratio = {quartile_consistency:.2%}")
print()

# -------------------------------------------------
# 6. 输出详细结果（含 Extreme 标记）
# -------------------------------------------------
output_file = "Robustness_Check_DeviationIndex.xlsx"
df.to_excel(output_file, index=False)

print(f"✅ Robustness check completed. Results saved to {output_file}")
